{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "405a3384-0d9b-4941-8200-59c99c2637dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests \n",
    "\n",
    "docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'\n",
    "docs_response = requests.get(docs_url)\n",
    "documents_raw = docs_response.json()\n",
    "\n",
    "documents = []\n",
    "\n",
    "for course in documents_raw:\n",
    "    course_name = course['course']\n",
    "\n",
    "    for doc in course['documents']:\n",
    "        doc['course'] = course_name\n",
    "        documents.append(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "50368b29-f5a2-4c73-b52f-798e38973242",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'name': '8231f8ecba96', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'w1BA6SjmTMibHoa4tHuOOw', 'version': {'number': '8.17.6', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'dbcbbbd0bc4924cfeb28929dc05d82d662c527b7', 'build_date': '2025-04-30T14:07:12.231372970Z', 'build_snapshot': False, 'lucene_version': '9.12.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from elasticsearch import Elasticsearch\n",
    "es_client = Elasticsearch('http://localhost:9200')\n",
    "es_client.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "df6c8319-3d03-48f0-b6f3-58c7a9c67fe4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "index_settings = {\n",
    "    \"settings\": {\n",
    "        \"number_of_shards\": 1,\n",
    "        \"number_of_replicas\": 0\n",
    "    },\n",
    "    \"mappings\": {\n",
    "        \"properties\": {\n",
    "            \"text\": {\"type\": \"text\"},\n",
    "            \"section\": {\"type\": \"text\"},\n",
    "            \"question\": {\"type\": \"text\"},\n",
    "            \"course\": {\"type\": \"keyword\"} \n",
    "        }\n",
    "    }\n",
    "}\n",
    "\n",
    "index_name = \"course-questions\"\n",
    "\n",
    "es_client.indices.create(index=index_name, body=index_settings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d2b1be47-06e4-4cd5-a6ec-cf92e9aa805e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm.auto import tqdm "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "eb40249b-9848-4a74-bc63-b652b1d3b8c5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1b8812f352ee4adfbd0a27204c6219d1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/948 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for doc in tqdm(documents):\n",
    "    es_client.index(index=index_name, document=doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "7fab4329-8230-4aca-b233-791442016072",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"How do execute a command on a Kubernetes pod?\"\n",
    "\n",
    "search_query = {\n",
    "    \"size\": 5,\n",
    "    \"query\": {\n",
    "        \"bool\": {\n",
    "            \"must\": {\n",
    "                \"multi_match\": {\n",
    "                    \"query\": query,\n",
    "                    \"fields\": [\"question^4\", \"text\"],\n",
    "                    \"type\": \"best_fields\"\n",
    "                }\n",
    "            },\n",
    "        }\n",
    "    }\n",
    "}\n",
    "\n",
    "search_results = es_client.search(index=index_name, body=search_query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8ef01df1-283a-4140-aa05-bb0c0be1e58a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "44.50556"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "search_results['hits']['hits'][0]['_score']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "5b26c1ca-b921-4886-8b52-432017d4a8bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"How do copy a file to a Docker container?\"\n",
    "\n",
    "search_query = {\n",
    "    \"size\": 3,\n",
    "    \"query\": {\n",
    "        \"bool\": {\n",
    "            \"must\": {\n",
    "                \"multi_match\": {\n",
    "                    \"query\": query,\n",
    "                    \"fields\": [\"question^4\", \"text\"],\n",
    "                    \"type\": \"best_fields\"\n",
    "                }\n",
    "            },\n",
    "            \"filter\": {\n",
    "                \"term\": {\n",
    "                    \"course\": \"machine-learning-zoomcamp\"\n",
    "                }\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "}\n",
    "\n",
    "search_results = es_client.search(index=index_name, body=search_query)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "377acaea-3752-43ce-b1d2-88845e3897b4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'_index': 'course-questions',\n",
       "  '_id': 'm3TAEZcBPE51qpNEuT90',\n",
       "  '_score': 73.38676,\n",
       "  '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\\ndocker run -it --entrypoint bash <image>\\nIf the container is already running, execute a command in the specific container:\\ndocker ps (find the container-id)\\ndocker exec -it <container-id> bash\\n(Marcos MJD)',\n",
       "   'section': '5. Deploying Machine Learning Models',\n",
       "   'question': 'How do I debug a docker container?',\n",
       "   'course': 'machine-learning-zoomcamp'}},\n",
       " {'_index': 'course-questions',\n",
       "  '_id': 'unTAEZcBPE51qpNEwD8s',\n",
       "  '_score': 66.688705,\n",
       "  '_source': {'text': \"You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:\\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\\nHrithik Kumar Advani\",\n",
       "   'section': '5. Deploying Machine Learning Models',\n",
       "   'question': 'How do I copy files from my local machine to docker container?',\n",
       "   'course': 'machine-learning-zoomcamp'}},\n",
       " {'_index': 'course-questions',\n",
       "  '_id': 'u3TAEZcBPE51qpNEwD9e',\n",
       "  '_score': 59.812744,\n",
       "  '_source': {'text': 'You can copy files from your local machine into a Docker container using the docker cp command. Here\\'s how to do it:\\nIn the Dockerfile, you can provide the folder containing the files that you want to copy over. The basic syntax is as follows:\\nCOPY [\"src/predict.py\", \"models/xgb_model.bin\", \"./\"]\\t\\t\\t\\t\\t\\t\\t\\t\\t\\t\\tGopakumar Gopinathan',\n",
       "   'section': '5. Deploying Machine Learning Models',\n",
       "   'question': 'How do I copy files from a different folder into docker container’s working directory?',\n",
       "   'course': 'machine-learning-zoomcamp'}}]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "search_results['hits']['hits']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "7ef298ee-70cc-4083-ab77-19f9d665bf7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "context_template = \"\"\"\n",
    "Q: {question}\n",
    "A: {text}\n",
    "\"\"\".strip()\n",
    "\n",
    "prompt_template = \"\"\"\n",
    "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n",
    "Use only the facts from the CONTEXT when answering the QUESTION.\n",
    "\n",
    "QUESTION: {question}\n",
    "\n",
    "CONTEXT:\n",
    "{context}\n",
    "\"\"\".strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "163fa4c5-1822-4770-8cdc-3614ddc359af",
   "metadata": {},
   "outputs": [],
   "source": [
    "context_pieces = []\n",
    "\n",
    "for hit in search_results['hits']['hits']:\n",
    "    doc = hit['_source']\n",
    "    context_piece = context_template.format(**doc)\n",
    "    context_pieces.append(context_piece)\n",
    "\n",
    "context = '\\n\\n'.join(context_pieces)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "3b5712cf-1b45-4a56-a7e2-8ff81dc850de",
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt = prompt_template.format(question=query, context=context)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "e92826bc-1d93-43a4-8c2c-8483009a6494",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1446"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "21bacfbd-b152-4173-8d03-055d5938cebe",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tiktoken"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "acae2085-c2d6-4e1f-98ce-3f039e2b3ac7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(prompt[:100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "d512c5f2-3c73-4006-9b26-76819b88e3fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "encoding = tiktoken.encoding_for_model(\"gpt-4o\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "18fb5d96-f44a-4e7b-b076-97d390fedf6f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "320"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(encoding.encode(prompt))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "bdc331b5-a6a8-47ce-8769-bde1e71ee0ff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[63842, 261, 4165, 14029, 29186, 13, 30985, 290, 150339, 4122]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokens = encoding.encode(prompt)[:10]\n",
    "tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "cc0b5407-bccb-4e5a-8a6d-bd787c0d13f0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "b'.'"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoding.decode_single_token_bytes(tokens[5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96a66a30-38eb-4c9d-9fb1-e053335cde9d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
